In [1]:
    
import numpy as np
import pandas as pd
import scipy as sp
import lightgbm as lgb
import gc
    
In [5]:
    
import matplotlib.pyplot as plt
%matplotlib inline
    
In [2]:
    
from sklearn.model_selection import train_test_split
    
In [42]:
    
# Load the LightGBM booster from its saved model file.
bst1 = lgb.Booster(model_file='../models/lightGBM_morefeatures_80_10.txt')
    
In [43]:
    
# Default size (width, height) applied to every figure created below.
plt.rcParams["figure.figsize"] = (9, 7)
    
In [44]:
    
# Plot the feature importances of the loaded booster, limited to 30 features.
lgb.plot_importance(bst1, max_num_features= 30)
    
    Out[44]:
    
In [45]:
    
# Read the saved prediction table; later cells use its `pred`, `order_id`
# and `product_id` columns.
prob_preds_df = pd.read_csv("../data/processed/lightGBM_morefeatures_prob_preds.csv")
    
In [46]:
    
# Preview the first rows of the prediction table.
prob_preds_df.head()
    
    Out[46]:
In [47]:
    
# (rows, columns) of the prediction table.
prob_preds_df.shape
    
    Out[47]:
In [60]:
    
# Histogram of the values in the `pred` column, using 30 bins.
ax = prob_preds_df['pred'].plot(kind='hist', bins=30)
ax.set_xlabel("probability")
    
    Out[60]:
    
In [48]:
    
def generate_submission(df_test, test_orders_ids, file_name, threshold = 0.2, single_thres = True):
    """Write a submission CSV listing the predicted products of each order.

    A product is assigned to an order when its ``pred`` value is strictly
    greater than ``threshold``. Every order in ``test_orders_ids`` that ends
    up without any product is written with the literal string ``'None'``.

    Parameters
    ----------
    df_test : pandas.DataFrame
        One row per (order, product) candidate. Must contain the columns
        ``order_id``, ``product_id`` and ``pred``.
    test_orders_ids : iterable
        Order ids that must all appear in the output file.
    file_name : str
        Destination path of the CSV file (columns ``order_id``, ``products``).
    threshold : float, default 0.2
        Cut-off applied to ``pred``.
    single_thres : bool, default True
        Only the single-threshold mode is implemented. With ``False`` no file
        is written and a warning is emitted instead of failing silently.

    Returns
    -------
    None
        The result is the file written to ``file_name``.
    """
    if not single_thres:
        # The multi-threshold mode was never implemented; keep the historical
        # no-op behaviour but make it visible to the caller.
        import warnings
        warnings.warn(
            "generate_submission: only single_thres=True is implemented; "
            "no file was written",
            stacklevel=2,
        )
        return

    # Vectorised filter first, so the Python-level loop only visits the rows
    # that actually pass the threshold (NaN predictions compare as False).
    selected = df_test.loc[df_test['pred'] > threshold, ['order_id', 'product_id']]

    # Group product ids per order, preserving row order within each order and
    # first-appearance order across orders.
    products_by_order = {}
    for order_id, product_id in zip(selected['order_id'], selected['product_id']):
        products_by_order.setdefault(order_id, []).append(str(product_id))

    submission = {
        order_id: ' '.join(products)
        for order_id, products in products_by_order.items()
    }

    # Orders without any selected product are appended in the order given.
    for order_id in test_orders_ids:
        submission.setdefault(order_id, 'None')

    # Building the two columns explicitly also works when there is nothing to
    # write, producing a header-only file.
    sub = pd.DataFrame(
        {
            'order_id': list(submission.keys()),
            'products': list(submission.values()),
        },
        columns=['order_id', 'products'],
    )
    sub.to_csv(file_name, index=False)
    
In [49]:
    
# Load the raw orders table; its `eval_set` and `order_id` columns are used
# to pick out the test orders.
orders_df = pd.read_csv("../data/raw/orders.csv")
    
In [50]:
    
# Order ids of every order whose `eval_set` is "test".
test_orders_ids = orders_df.loc[orders_df["eval_set"] == "test", "order_id"]
    
In [51]:
    
# Write one submission file per probability cut-off, in the same order as
# before: 0.3, 0.25, 0.15, then 0.1.
for out_path, thr in (
    ("../models/lightGBM_morefeatures_preds_30%thr.csv", 0.3),
    ("../models/lightGBM_morefeatures_preds_25%thr.csv", 0.25),
    ("../models/lightGBM_morefeatures_preds_15%thr.csv", 0.15),
    ("../models/lightGBM_morefeatures_preds_10%thr.csv", 0.1),
):
    generate_submission(prob_preds_df, test_orders_ids, out_path, threshold=thr, single_thres=True)
    
In [ ]: